library(Seurat)
library(SeuratObject)
library(SeuratDisk)
library(tidyverse)
library(patchwork)
library(harmony)

First give a try to integrate all the human wound and mouse wound data

1. Prepare wound data

# load the human and mouse converted gene symbols (made by myself)
hs_ms_genes <- data.table::fread("Private_human_mouse_GeneSymbol_conversion.txt")

# Day 3 after wounding (Dongqing Snhg26)
ms_seu <- readRDS("../s1-scData/01-our ms data/s2_miceD3data_onlyWT.rds")
## convert the mouse gene symbol to human gene symbol (orthology)
exp_mtx <- as.matrix(ms_seu@assays$RNA@counts);dim(exp_mtx)
v2genes <- data.frame(ms_gene = rownames(exp_mtx)) %>% left_join(., hs_ms_genes)

table(is.na(v2genes$hs_gene))
##lots of NAs for which there are no human genes matching
## Remove NAs
v2genes <- v2genes[!is.na(v2genes$hs_gene),,F]
## Filter the expression matrix for genes which a mouse counterpart is available
exp_mtx <- exp_mtx[v2genes$ms_gene,]
## Now change the rownames of the matrix to the mouse gene names
rownames(exp_mtx) <- v2genes$hs_gene
#check the duplicated gene names
duplicated(rownames(exp_mtx)) %>% table()
exp_mtx <- exp_mtx[!duplicated(rownames(exp_mtx)),]
dim(exp_mtx);identical(colnames(exp_mtx), colnames(ms_seu))
## Create the seurat object with mouse genes.
ms_seu <- CreateSeuratObject(counts = exp_mtx, meta.data = ms_seu@meta.data)
rownames(ms_seu)[grep("KRT6", rownames(ms_seu))]
rownames(ms_seu)[grep("MMP", rownames(ms_seu))]

saveRDS(ms_seu, file = "../s1-scData/01-our ms data/s2_miceD3data_onlyWT_humanGenes.rds")

2. Integrate the human and mouse wound data

# mouse wound with human gene symbol (own data)
ms_seu <- readRDS("../all_miceD3data_onlyWT_humanGenes.rds")
ms_seu$Condition <- gsub("Skin", "Unwound", ms_seu$Condition) # Change the name since it is the same as human wound
ms_seu$Condition <- gsub("Wound", "DPW3", ms_seu$Condition)
ms_seu <- subset(ms_seu, subset = Condition == "DPW3")
table(ms_seu$orig.ident)
## 
## WTwound 
##    4601
rownames(ms_seu)[grep("^IL", rownames(ms_seu))] %>% sort()
##  [1] "IL10"     "IL10RA"   "IL10RB"   "IL11"     "IL11RA"   "IL12A"   
##  [7] "IL12B"    "IL12RB1"  "IL12RB2"  "IL13"     "IL13RA1"  "IL13RA2" 
## [13] "IL15"     "IL15RA"   "IL16"     "IL17B"    "IL17D"    "IL17F"   
## [19] "IL17RA"   "IL17RB"   "IL17RC"   "IL17RD"   "IL17RE"   "IL18"    
## [25] "IL18BP"   "IL18R1"   "IL18RAP"  "IL19"     "IL1A"     "IL1B"    
## [31] "IL1F10"   "IL1R1"    "IL1R2"    "IL1RAP"   "IL1RAPL1" "IL1RL1"  
## [37] "IL1RL2"   "IL1RN"    "IL2"      "IL20"     "IL20RA"   "IL20RB"  
## [43] "IL21R"    "IL22RA1"  "IL22RA2"  "IL23A"    "IL23R"    "IL24"    
## [49] "IL27"     "IL27RA"   "IL2RA"    "IL2RB"    "IL2RG"    "IL31RA"  
## [55] "IL33"     "IL34"     "IL3RA"    "IL4I1"    "IL4R"     "IL5"     
## [61] "IL6"      "IL6R"     "IL6ST"    "IL7"      "IL7R"     "IL9R"    
## [67] "ILDR1"    "ILDR2"    "ILF2"     "ILF3"     "ILK"      "ILKAP"   
## [73] "ILRUN"    "ILVBL"
# Human skin wound healing 
hs_seu <- readRDS("../all_human_wound_metadata.rds")
#hs_seu <- subset(hs_seu, subset = Condition == "Skin" | Condition == "Wound1")
hs_seu <- subset(hs_seu, subset = Condition == "Wound1")
hs_seu@meta.data <- droplevels(hs_seu@meta.data)
table(hs_seu$orig.ident)
## 
## PWH26D1 PWH27D1 PWH28D1 
##    4662    6300    4484
rownames(hs_seu)[grep("^IL", rownames(hs_seu))] %>% sort()
##  [1] "IL10"       "IL10RA"     "IL10RB"     "IL10RB-DT"  "IL11"      
##  [6] "IL11RA"     "IL12A"      "IL12A-AS1"  "IL12B"      "IL12RB1"   
## [11] "IL12RB2"    "IL13"       "IL13RA1"    "IL13RA2"    "IL15"      
## [16] "IL15RA"     "IL16"       "IL17A"      "IL17B"      "IL17D"     
## [21] "IL17RA"     "IL17RB"     "IL17RC"     "IL17RD"     "IL17RE"    
## [26] "IL18"       "IL18BP"     "IL18R1"     "IL18RAP"    "IL19"      
## [31] "IL1A"       "IL1B"       "IL1F10"     "IL1R1"      "IL1R2"     
## [36] "IL1RAP"     "IL1RAPL1"   "IL1RAPL2"   "IL1RL1"     "IL1RL2"    
## [41] "IL1RN"      "IL2"        "IL20"       "IL20RA"     "IL20RB"    
## [46] "IL20RB-AS1" "IL21R"      "IL21R-AS1"  "IL22"       "IL22RA1"   
## [51] "IL22RA2"    "IL23A"      "IL23R"      "IL24"       "IL25"      
## [56] "IL26"       "IL27"       "IL27RA"     "IL2RA"      "IL2RB"     
## [61] "IL2RG"      "IL31RA"     "IL32"       "IL33"       "IL34"      
## [66] "IL36B"      "IL36G"      "IL36RN"     "IL37"       "IL3RA"     
## [71] "IL4"        "IL4I1"      "IL4R"       "IL5"        "IL5RA"     
## [76] "IL6"        "IL6-AS1"    "IL6R"       "IL6R-AS1"   "IL6ST"     
## [81] "IL7"        "IL7R"       "IL9R"       "ILDR1"      "ILDR2"     
## [86] "ILF2"       "ILF3"       "ILF3-DT"    "ILK"        "ILKAP"     
## [91] "ILRUN"      "ILRUN-AS1"  "ILVBL"      "ILVBL-AS1"
# check the overlapped genes and keep the overlapped genes
overgene <- intersect(rownames(ms_seu), rownames(hs_seu))
length(overgene)
## [1] 14934
ms_seu <- ms_seu[overgene, ]
hs_seu <- hs_seu[overgene, ]

# integrate based on orig.ident names
ms_seu$Species="Mouse"
hs_seu$Species="Human"

3. sctransform normalization

all_seu <- merge(hs_seu,  y = c(ms_seu),
                project = "wounds")

# sampling the cell numbers 
lowNum <- min(table(all_seu$Species))
metadata <- all_seu@meta.data %>% rownames_to_column(var = "barcode")
to_keep <- metadata %>% group_by(Species) %>% sample_n(size = lowNum, replace = FALSE) %>%
    pull(barcode)
all_seu <- subset(all_seu, cells = to_keep)
table(all_seu$Condition)
## 
##   DPW3 Wound1 
##   4601   4601
alldata = SplitObject(all_seu, split.by = "orig.ident")
rm(all_seu)
# normalize and identify variable features for each dataset independently
alldata <- lapply(X = alldata, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 4000)
})

4. Run CCA Clustering

# Run CCA using defaults
features <- SelectIntegrationFeatures(object.list = alldata, nfeatures = 3000)
anno <- readRDS("../gene_conversion_hs_ms/humanAnnotation.rds")
features <- features %>% as.data.frame() %>% setNames("gene") %>% left_join(., anno[, c(1,3)], by=c("gene"="external_gene_name")) %>% 
  filter(gene_biotype == "protein_coding" | gene_biotype == "lncRNA") %>% distinct(gene) %>% pull(gene)

rn <- lapply(alldata, rownames) 
nE <- colSums(Reduce(rbind, lapply(rn, function(x) features %in% x))) 
features <- features[nE == length(alldata)] #Keep only variable genes expressed in all samples
length(features)
## [1] 2990
alldata.anchors <- FindIntegrationAnchors(object.list = alldata, anchor.features = features)
# this command creates an 'integrated' data assay
inteData <- IntegrateData(anchorset = alldata.anchors)

# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(inteData) <- "integrated"
VariableFeatures(inteData) <- features

# Run the standard workflow for visualization and clustering
inteData <- ScaleData(inteData, verbose = FALSE)
inteData <- RunPCA(inteData, npcs = 30, verbose = FALSE)

inteData <- RunUMAP(inteData, dims = 1:30, assay = "integrated", reduction = "pca", n.neighbors = 30,
                    min.dist = 0.5, n.epochs = 500, spread = 1, learning.rate = 1)

DimPlot(inteData, group.by = "orig.ident") + NoAxes()

inteData = FindNeighbors(inteData, dims = 1:30, reduction = "pca", k.param = 30)
inteData = FindClusters(inteData, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9202
## Number of edges: 487989
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9446
## Number of communities: 18
## Elapsed time: 1 seconds
inteData = FindClusters(inteData, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9202
## Number of edges: 487989
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8992
## Number of communities: 29
## Elapsed time: 1 seconds
inteData = FindClusters(inteData, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9202
## Number of edges: 487989
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9287
## Number of communities: 24
## Elapsed time: 1 seconds
inteData = FindClusters(inteData, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 9202
## Number of edges: 487989
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9102
## Number of communities: 27
## Elapsed time: 1 seconds
#inteData$Condition <- factor(inteData$Condition, levels = c("Skin", "Wound1", "Unwound", "DPW3"))
inteData$Condition <- factor(inteData$Condition, levels = c("Wound1",  "DPW3"))

DefaultAssay(inteData) <- "RNA"
inteData <- NormalizeData(inteData)

5. CellType plotting

DimPlot(inteData, group.by = "Species", cols = c("#d8b365", "#5ab4ac")) + ggtitle("Cross species integration")

DimPlot(inteData, group.by = "Species", split.by = "Species", cols = c("#d8b365", "#5ab4ac")) + ggtitle("Cross species integration")

DimPlot(inteData, group.by = "CellType", label = T, split.by = "Species") + ggtitle("")

DimPlot(inteData, group.by = "CellType", label = T, split.by = "Condition", ncol = 4) + ggtitle("") + NoLegend()

DimPlot(inteData, group.by = "integrated_snn_res.0.3", label = T, split.by = "Condition")

DimPlot(inteData, group.by = "integrated_snn_res.0.5", label = T, split.by = "Condition")

DimPlot(inteData, group.by = "integrated_snn_res.0.8", label = T, split.by = "Condition")

DimPlot(inteData, group.by = "integrated_snn_res.1", label = T, split.by = "Condition")

6. Data exporting

saveRDS(inteData, "all_HsMs_CCA_W1_DPW3_sampling_noSkin.rds")
inteData <- readRDS("all_HsMs_CCA_W1_DPW3_noSkin.rds")
DimPlot(inteData, group.by = "CellType", label = T, split.by = "Condition", ncol = 4) + ggtitle("") + NoLegend()
table(inteData$CellType, inteData$integrated_snn_res.0.3)

Pie chart

###############################################
#-- Cell proportion analysis across species --#
## Step 1. calculate the cell types number per sample, add the location information, 
## total numbers, as well the condition information
metadata <- inteData@meta.data
cl_species <- table(metadata$Condition, metadata$Species) %>% as.data.frame() %>% dplyr::filter(Freq > 0)

sm_ct_nu <- table(metadata$Condition, metadata$integrated_snn_res.0.3) %>% as.data.frame() %>% 
  setNames(c("Sample", "CellCluster", "Num")) %>%
  left_join(., cl_species, by = c("Sample" = "Var1"))  %>% mutate(prop = Num / Freq)

## step 2. calculate the total normalized proportions of each cell type per condition
df.group <- sm_ct_nu %>% group_by(CellCluster, Var2) %>% summarise(Freq2=sum(prop)) %>% 
  ungroup() %>% group_by(CellCluster) %>% mutate(Freq_new = Freq2/sum(Freq2), lbl = scales::percent(Freq_new)) %>% ungroup()

ggplot(data=df.group, aes(x=" ", y=Freq_new, group=Var2, colour=Var2, fill=Var2)) +
  geom_bar(width = 1, stat = "identity") +
  scale_fill_manual(values = c("#c45027", "#00afbb")) +
  scale_color_manual(values = c("#c45027", "#00afbb")) +
  coord_polar("y", start=0) + 
  facet_grid(CellCluster ~ .) + theme_void()

pdf("PieChart_res0.05.pdf", useDingbats = FALSE, width = 3, height = 6)
dev.off()

SessionInfo

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS:   /sw/apps/R/4.1.1/rackham/lib64/R/lib/libRblas.so
## LAPACK: /sw/apps/R/4.1.1/rackham/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] harmony_0.1.0         Rcpp_1.0.7            patchwork_1.1.1      
##  [4] forcats_0.5.1         stringr_1.4.0         dplyr_1.0.7          
##  [7] purrr_0.3.4           readr_2.0.2           tidyr_1.1.4          
## [10] tibble_3.1.5          ggplot2_3.3.5         tidyverse_1.3.1      
## [13] SeuratDisk_0.0.0.9019 SeuratObject_4.0.2    Seurat_4.0.5         
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.3.0       plyr_1.8.6           
##   [4] igraph_1.2.7          lazyeval_0.2.2        splines_4.1.1        
##   [7] listenv_0.8.0         scattermore_0.7       digest_0.6.28        
##  [10] htmltools_0.5.2       fansi_0.5.0           magrittr_2.0.1       
##  [13] tensor_1.5            cluster_2.1.2         ROCR_1.0-11          
##  [16] tzdb_0.1.2            globals_0.14.0        modelr_0.1.8         
##  [19] matrixStats_0.61.0    spatstat.sparse_2.0-0 colorspace_2.0-2     
##  [22] rvest_1.0.2           ggrepel_0.9.1         haven_2.4.3          
##  [25] xfun_0.28             crayon_1.4.2          jsonlite_1.7.2       
##  [28] spatstat.data_2.1-0   survival_3.2-13       zoo_1.8-9            
##  [31] glue_1.5.0            polyclip_1.10-0       gtable_0.3.0         
##  [34] leiden_0.3.9          future.apply_1.8.1    abind_1.4-5          
##  [37] scales_1.1.1          DBI_1.1.1             miniUI_0.1.1.1       
##  [40] viridisLite_0.4.0     xtable_1.8-4          reticulate_1.22      
##  [43] spatstat.core_2.3-1   bit_4.0.4             htmlwidgets_1.5.4    
##  [46] httr_1.4.2            RColorBrewer_1.1-2    ellipsis_0.3.2       
##  [49] ica_1.0-2             farver_2.1.0          pkgconfig_2.0.3      
##  [52] sass_0.4.0            uwot_0.1.10           dbplyr_2.1.1         
##  [55] deldir_1.0-6          utf8_1.2.2            labeling_0.4.2       
##  [58] tidyselect_1.1.1      rlang_0.4.12          reshape2_1.4.4       
##  [61] later_1.3.0           munsell_0.5.0         cellranger_1.1.0     
##  [64] tools_4.1.1           cli_3.1.0             generics_0.1.1       
##  [67] broom_0.7.10          ggridges_0.5.3        evaluate_0.14        
##  [70] fastmap_1.1.0         yaml_2.2.1            goftest_1.2-3        
##  [73] knitr_1.36            bit64_4.0.5           fs_1.5.0             
##  [76] fitdistrplus_1.1-6    RANN_2.6.1            pbapply_1.5-0        
##  [79] future_1.23.0         nlme_3.1-153          mime_0.12            
##  [82] xml2_1.3.2            hdf5r_1.3.4           compiler_4.1.1       
##  [85] rstudioapi_0.13       plotly_4.10.0         png_0.1-7            
##  [88] spatstat.utils_2.2-0  reprex_2.0.1          bslib_0.3.1          
##  [91] stringi_1.7.5         highr_0.9             RSpectra_0.16-0      
##  [94] lattice_0.20-45       Matrix_1.3-4          vctrs_0.3.8          
##  [97] pillar_1.6.4          lifecycle_1.0.1       spatstat.geom_2.3-0  
## [100] lmtest_0.9-39         jquerylib_0.1.4       RcppAnnoy_0.0.19     
## [103] data.table_1.14.2     cowplot_1.1.1         irlba_2.3.3          
## [106] httpuv_1.6.3          R6_2.5.1              promises_1.2.0.1     
## [109] KernSmooth_2.23-20    gridExtra_2.3         parallelly_1.28.1    
## [112] codetools_0.2-18      MASS_7.3-54           assertthat_0.2.1     
## [115] withr_2.4.2           sctransform_0.3.2     mgcv_1.8-38          
## [118] parallel_4.1.1        hms_1.1.1             grid_4.1.1           
## [121] rpart_4.1-15          rmarkdown_2.11        Rtsne_0.15           
## [124] shiny_1.7.1           lubridate_1.8.0